Data Preparation

# Importing necessary libraries
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.4     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(usmap)
## Warning: package 'usmap' was built under R version 4.1.2
library(mlr3)
## Warning: package 'mlr3' was built under R version 4.1.2
library(mlr3learners)
## Warning: package 'mlr3learners' was built under R version 4.1.2
library(mlr3pipelines)
## Warning: package 'mlr3pipelines' was built under R version 4.1.2
library(mlr3tuning)
## Loading required package: paradox
## Warning: package 'paradox' was built under R version 4.1.2
library(paradox)
# Reading the dataset

# Yearly flight records: one CSV per year, stacked into a single table.
flight_1 <- read.csv(file = "2003.csv")
flight_2 <- read.csv(file = "2004.csv")
flight <- do.call(rbind, list(flight_1, flight_2))

# Lookup tables; empty strings in the plane data are read as NA.
plane <- read.csv(file = "plane-data.csv", na.strings = "")
carriers <- read.csv(file = "carriers.csv")
airports <- read.csv(file = "airports.csv")
# Get a statistical summary of the combined flight data
summary(flight)
##       Year          Month          DayofMonth      DayOfWeek    
##  Min.   :2003   Min.   : 1.000   Min.   : 1.00   Min.   :1.000  
##  1st Qu.:2003   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.:2.000  
##  Median :2004   Median : 7.000   Median :16.00   Median :4.000  
##  Mean   :2004   Mean   : 6.538   Mean   :15.74   Mean   :3.937  
##  3rd Qu.:2004   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:6.000  
##  Max.   :2004   Max.   :12.000   Max.   :31.00   Max.   :7.000  
##                                                                 
##     DepTime         CRSDepTime      ArrTime         CRSArrTime  
##  Min.   :   1     Min.   :   0   Min.   :   1     Min.   :   0  
##  1st Qu.: 938     1st Qu.: 935   1st Qu.:1123     1st Qu.:1128  
##  Median :1330     Median :1330   Median :1524     Median :1527  
##  Mean   :1346     Mean   :1341   Mean   :1500     Mean   :1507  
##  3rd Qu.:1732     3rd Qu.:1725   3rd Qu.:1915     3rd Qu.:1911  
##  Max.   :2750     Max.   :2400   Max.   :2955     Max.   :2400  
##  NA's   :229226                  NA's   :254391                 
##  UniqueCarrier        FlightNum      TailNum          ActualElapsedTime
##  Length:13617810    Min.   :   1   Length:13617810    Min.   :-710.0   
##  Class :character   1st Qu.: 587   Class :character   1st Qu.:  72.0   
##  Mode  :character   Median :1408   Mode  :character   Median : 104.0   
##                     Mean   :2054                      Mean   : 122.9   
##                     3rd Qu.:2887                      3rd Qu.: 154.0   
##                     Max.   :9912                      Max.   :1777.0   
##                                                       NA's   :254392   
##  CRSElapsedTime      AirTime           ArrDelay           DepDelay       
##  Min.   : -85.0   Min.   :-3818.0   Min.   :-1302.00   Min.   :-1410.00  
##  1st Qu.:  74.0   1st Qu.:   53.0   1st Qu.:  -10.00   1st Qu.:   -4.00  
##  Median : 105.0   Median :   85.0   Median :   -2.00   Median :    0.00  
##  Mean   : 124.1   Mean   :  103.3   Mean   :    5.12   Mean   :    6.63  
##  3rd Qu.: 155.0   3rd Qu.:  137.0   3rd Qu.:    9.00   3rd Qu.:    4.00  
##  Max.   :1441.0   Max.   : 3508.0   Max.   : 1879.00   Max.   : 1882.00  
##  NA's   :1        NA's   :254391    NA's   :254392     NA's   :229226    
##     Origin              Dest              Distance          TaxiIn        
##  Length:13617810    Length:13617810    Min.   :   6.0   Min.   :   0.000  
##  Class :character   Class :character   1st Qu.: 304.0   1st Qu.:   4.000  
##  Mode  :character   Mode  :character   Median : 547.0   Median :   5.000  
##                                        Mean   : 712.6   Mean   :   7.469  
##                                        3rd Qu.: 944.0   3rd Qu.:   7.000  
##                                        Max.   :4962.0   Max.   :1495.000  
##                                                                           
##     TaxiOut          Cancelled       CancellationCode      Diverted       
##  Min.   :   0.00   Min.   :0.00000   Length:13617810    Min.   :0.000000  
##  1st Qu.:  10.00   1st Qu.:0.00000   Class :character   1st Qu.:0.000000  
##  Median :  13.00   Median :0.00000   Mode  :character   Median :0.000000  
##  Mean   :  15.42   Mean   :0.01683                      Mean   :0.001848  
##  3rd Qu.:  18.00   3rd Qu.:0.00000                      3rd Qu.:0.000000  
##  Max.   :3905.00   Max.   :1.00000                      Max.   :1.000000  
##                                                                           
##   CarrierDelay      WeatherDelay        NASDelay       SecurityDelay    
##  Min.   :   0.0    Min.   :   0.0    Min.   : -60.0    Min.   :  0      
##  1st Qu.:   0.0    1st Qu.:   0.0    1st Qu.:   0.0    1st Qu.:  0      
##  Median :   0.0    Median :   0.0    Median :   0.0    Median :  0      
##  Mean   :   2.5    Mean   :   0.6    Mean   :   3.3    Mean   :  0      
##  3rd Qu.:   0.0    3rd Qu.:   0.0    3rd Qu.:   0.0    3rd Qu.:  0      
##  Max.   :1879.0    Max.   :1230.0    Max.   :1385.0    Max.   :533      
##  NA's   :2672742   NA's   :2672742   NA's   :2672742   NA's   :2672742  
##  LateAircraftDelay
##  Min.   :   0.0   
##  1st Qu.:   0.0   
##  Median :   0.0   
##  Mean   :   3.2   
##  3rd Qu.:   0.0   
##  Max.   :1407.0   
##  NA's   :2672742
# Cleaning NAs

# Keep only flights that have a departure time, an arrival time and an
# arrival delay recorded.
flight <- drop_na(flight, DepTime, ArrTime, ArrDelay)

# Re-check the column summaries after dropping the incomplete rows.
summary(flight)
##       Year          Month          DayofMonth      DayOfWeek       DepTime    
##  Min.   :2003   Min.   : 1.000   Min.   : 1.00   Min.   :1.00   Min.   :   1  
##  1st Qu.:2003   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.:2.00   1st Qu.: 938  
##  Median :2004   Median : 7.000   Median :16.00   Median :4.00   Median :1330  
##  Mean   :2004   Mean   : 6.541   Mean   :15.75   Mean   :3.94   Mean   :1346  
##  3rd Qu.:2004   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:6.00   3rd Qu.:1732  
##  Max.   :2004   Max.   :12.000   Max.   :31.00   Max.   :7.00   Max.   :2750  
##                                                                               
##    CRSDepTime      ArrTime       CRSArrTime   UniqueCarrier        FlightNum   
##  Min.   :   0   Min.   :   1   Min.   :   0   Length:13363418    Min.   :   1  
##  1st Qu.: 935   1st Qu.:1123   1st Qu.:1128   Class :character   1st Qu.: 585  
##  Median :1328   Median :1524   Median :1526   Mode  :character   Median :1403  
##  Mean   :1340   Mean   :1500   Mean   :1506                      Mean   :2042  
##  3rd Qu.:1725   3rd Qu.:1915   3rd Qu.:1910                      3rd Qu.:2857  
##  Max.   :2400   Max.   :2955   Max.   :2400                      Max.   :9912  
##                                                                                
##    TailNum          ActualElapsedTime CRSElapsedTime      AirTime       
##  Length:13363418    Min.   :-710.0    Min.   : -32.0   Min.   :-3818.0  
##  Class :character   1st Qu.:  72.0    1st Qu.:  75.0   1st Qu.:   53.0  
##  Mode  :character   Median : 104.0    Median : 105.0   Median :   85.0  
##                     Mean   : 122.9    Mean   : 124.4   Mean   :  103.3  
##                     3rd Qu.: 154.0    3rd Qu.: 155.0   3rd Qu.:  137.0  
##                     Max.   :1777.0    Max.   :1441.0   Max.   : 3508.0  
##                                                                         
##     ArrDelay            DepDelay            Origin              Dest          
##  Min.   :-1302.000   Min.   :-1410.000   Length:13363418    Length:13363418   
##  1st Qu.:  -10.000   1st Qu.:   -4.000   Class :character   Class :character  
##  Median :   -2.000   Median :    0.000   Mode  :character   Mode  :character  
##  Mean   :    5.119   Mean   :    6.601                                        
##  3rd Qu.:    9.000   3rd Qu.:    4.000                                        
##  Max.   : 1879.000   Max.   : 1882.000                                        
##                                                                               
##     Distance        TaxiIn            TaxiOut          Cancelled
##  Min.   :   8   Min.   :   0.000   Min.   :   0.00   Min.   :0  
##  1st Qu.: 305   1st Qu.:   4.000   1st Qu.:  10.00   1st Qu.:0  
##  Median : 547   Median :   5.000   Median :  13.00   Median :0  
##  Mean   : 715   Mean   :   7.474   Mean   :  15.67   Mean   :0  
##  3rd Qu.: 946   3rd Qu.:   7.000   3rd Qu.:  18.00   3rd Qu.:0  
##  Max.   :4962   Max.   :1495.000   Max.   :3905.00   Max.   :0  
##                                                                 
##  CancellationCode      Diverted  CarrierDelay      WeatherDelay    
##  Length:13363418    Min.   :0   Min.   :   0.0    Min.   :   0.0   
##  Class :character   1st Qu.:0   1st Qu.:   0.0    1st Qu.:   0.0   
##  Mode  :character   Median :0   Median :   0.0    Median :   0.0   
##                     Mean   :0   Mean   :   2.6    Mean   :   0.7   
##                     3rd Qu.:0   3rd Qu.:   0.0    3rd Qu.:   0.0   
##                     Max.   :0   Max.   :1879.0    Max.   :1230.0   
##                                 NA's   :2619866   NA's   :2619866  
##     NASDelay       SecurityDelay     LateAircraftDelay
##  Min.   : -60.0    Min.   :  0       Min.   :   0.0   
##  1st Qu.:   0.0    1st Qu.:  0       1st Qu.:   0.0   
##  Median :   0.0    Median :  0       Median :   0.0   
##  Mean   :   3.4    Mean   :  0       Mean   :   3.2   
##  3rd Qu.:   0.0    3rd Qu.:  0       3rd Qu.:   0.0   
##  Max.   :1385.0    Max.   :533       Max.   :1407.0   
##  NA's   :2619866   NA's   :2619866   NA's   :2619866

Question 1

When is the best time of day, day of the week, and time of year to fly to minimise delays?

# Creating range of times
# Label every flight with the three-hour window its DepTime falls in.
# DepTime values above 2400 are folded into the first window of the day.
flight <- mutate(
  flight,
  new_dep = case_when(
    DepTime > 2400 | (DepTime >= 1 & DepTime <= 300) ~ "12 AM - 3 AM",
    DepTime >= 301 & DepTime <= 600 ~ "3 AM - 6 AM",
    DepTime >= 601 & DepTime <= 900 ~ "6 AM - 9 AM",
    DepTime >= 901 & DepTime <= 1200 ~ "9 AM - 12 PM",
    DepTime >= 1201 & DepTime <= 1500 ~ "12 PM - 3 PM",
    DepTime >= 1501 & DepTime <= 1800 ~ "3 PM - 6 PM",
    DepTime >= 1801 & DepTime <= 2100 ~ "6 PM - 9 PM",
    DepTime >= 2101 & DepTime <= 2400 ~ "9 PM - 12 AM"
  )
)
# Filtering, grouping, and finding an average of arrival delay and delay counts per time interval

# Chronological order of the departure windows, used as the factor levels so
# plots show the day from midnight onwards.
dep_window_order <- c(
  "12 AM - 3 AM", "3 AM - 6 AM", "6 AM - 9 AM", "9 AM - 12 PM",
  "12 PM - 3 PM", "3 PM - 6 PM", "6 PM - 9 PM", "9 PM - 12 AM"
)

# Among flights that arrived late (ArrDelay > 0): mean arrival delay and number
# of delayed flights per departure window, sorted from shortest to longest
# average delay.
best_time_of_day <- flight %>%
  filter(ArrDelay > 0) %>%
  group_by(new_dep) %>%
  summarise(
    avg_delay_in_mins = round(mean(ArrDelay), 2),
    delay_counts = n()
  ) %>%
  arrange(avg_delay_in_mins) %>%
  mutate(new_dep = factor(new_dep, levels = dep_window_order))

best_time_of_day
## # A tibble: 8 x 3
##   new_dep      avg_delay_in_mins delay_counts
##   <fct>                    <dbl>        <int>
## 1 3 AM - 6 AM               12.5        50906
## 2 6 AM - 9 AM               15.1       807703
## 3 9 AM - 12 PM              20.0       963895
## 4 12 PM - 3 PM              23.0      1037391
## 5 3 PM - 6 PM               27.2      1155089
## 6 6 PM - 9 PM               34.3      1029275
## 7 9 PM - 12 AM              48.5       385752
## 8 12 AM - 3 AM              84.3        29772
# Viz Best time of Day
# Figure 1: bar chart of the average arrival delay per departure window, with
# the value printed in white inside each bar.
ggplot(
  best_time_of_day,
  aes(x = new_dep, y = avg_delay_in_mins, fill = avg_delay_in_mins)
) +
  geom_col() +
  geom_text(aes(label = avg_delay_in_mins), vjust = 1.4, colour = "white") +
  scale_fill_gradient(low = "#1DB0CA", high = "#232971") +
  labs(
    x = NULL,
    y = "Avg Delay (min)",
    title = "Average Flight Delay in 2003-2004",
    subtitle = "(per 3 hours)",
    caption = "Figure 1"
  ) +
  theme_classic() +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    plot.caption = element_text(hjust = 0.5),
    legend.position = "none"
  )

# Filtering, grouping, and finding an average arrival delay and delay count for each day of week
# Day names indexed by the DayOfWeek code (1 = Monday ... 7 = Sunday).
weekday_names <- c(
  "Monday", "Tuesday", "Wednesday", "Thursday",
  "Friday", "Saturday", "Sunday"
)

# Among late arrivals (ArrDelay > 0): mean arrival delay and number of delayed
# flights per weekday, sorted from shortest to longest average delay.
# Factor levels run Sunday -> Monday, the reverse of the weekday order.
best_day_of_week <- flight %>%
  filter(ArrDelay > 0) %>%
  group_by(DayOfWeek) %>%
  summarise(
    avg_delay_in_min = round(mean(ArrDelay), 2),
    delay_count = n()
  ) %>%
  mutate(DayOfWeek = weekday_names[DayOfWeek]) %>%
  arrange(avg_delay_in_min) %>%
  mutate(DayOfWeek = factor(DayOfWeek, levels = rev(weekday_names)))

best_day_of_week
## # A tibble: 7 x 3
##   DayOfWeek avg_delay_in_min delay_count
##   <fct>                <dbl>       <int>
## 1 Saturday              23.3      600173
## 2 Tuesday               25.5      755547
## 3 Wednesday             26.2      796757
## 4 Friday                26.6      883602
## 5 Thursday              27.0      860557
## 6 Sunday                27.2      750076
## 7 Monday                27.6      813071
# Plotting average flight delay for every day of week

# Figure 2: horizontal bar chart of the average arrival delay per weekday,
# with the value printed at the end of each bar.
ggplot(
  best_day_of_week,
  aes(x = DayOfWeek, y = avg_delay_in_min, fill = DayOfWeek)
) +
  geom_col() +
  geom_text(aes(label = avg_delay_in_min), hjust = 1.2) +
  coord_flip() +
  ylim(0, 28) +
  labs(
    x = NULL,
    y = "Avg Delay (min)",
    title = "Average Flight Delay in 2003-2004",
    subtitle = "(per day)",
    caption = "Figure 2"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    plot.caption = element_text(hjust = 0.5),
    legend.position = "none"
  )

# Creating a new column called quarter to get the best time of the year

# Tag each flight with the calendar quarter its Month belongs to.
flight <- mutate(
  flight,
  quarter = case_when(
    between(Month, 1, 3) ~ "Q1",
    between(Month, 4, 6) ~ "Q2",
    between(Month, 7, 9) ~ "Q3",
    between(Month, 10, 12) ~ "Q4"
  )
)
# Separate best time of year in 2003 and 2004

# 2003 only: mean arrival delay and number of late arrivals (ArrDelay > 0)
# per quarter. The result stays grouped by Year.
best_time_of_2003 <- flight %>%
  filter(Year == 2003, ArrDelay > 0) %>%
  group_by(Year, quarter) %>%
  summarise(
    avg_delay_in_min = round(mean(ArrDelay), 2),
    delay_count = n()
  )
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
# 2004 only: mean arrival delay and number of late arrivals (ArrDelay > 0)
# per quarter. The result stays grouped by Year.
best_time_of_2004 <- flight %>%
  filter(Year == 2004, ArrDelay > 0) %>%
  group_by(Year, quarter) %>%
  summarise(
    avg_delay_in_min = round(mean(ArrDelay), 2),
    delay_count = n()
  )
## `summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.
# Preview of best quarter in 2003

best_time_of_2003
## # A tibble: 4 x 4
## # Groups:   Year [1]
##    Year quarter avg_delay_in_min delay_count
##   <int> <chr>              <dbl>       <int>
## 1  2003 Q1                  24.9      596997
## 2  2003 Q2                  22.9      557582
## 3  2003 Q3                  27.1      625177
## 4  2003 Q4                  24.1      661585
# Preview of best quarter in 2004

best_time_of_2004
## # A tibble: 4 x 4
## # Groups:   Year [1]
##    Year quarter avg_delay_in_min delay_count
##   <int> <chr>              <dbl>       <int>
## 1  2004 Q1                  26.4      737006
## 2  2004 Q2                  28.9      760521
## 3  2004 Q3                  28.2      726691
## 4  2004 Q4                  27.0      794224
# Creating a line graph for best time in 2003 and 2004

# Figure 3: quarterly average arrival delay, one line per year.
#
# Both years are stacked into one table so that colour and linetype can be
# *mapped* to Year inside aes(). With the colours hard-coded on each layer the
# manual colour scale had nothing to scale and no Year legend was drawn.
best_time_of_year <- bind_rows(
  ungroup(best_time_of_2003),
  ungroup(best_time_of_2004)
) %>%
  mutate(Year = factor(Year))

ggplot(
  best_time_of_year,
  aes(
    x = quarter,
    y = avg_delay_in_min,
    group = Year,
    colour = Year,
    linetype = Year
  )
) +
  geom_line() +
  # Value labels stay black and are kept out of the legend keys.
  geom_text(
    aes(label = avg_delay_in_min),
    hjust = 1.2,
    colour = "black",
    show.legend = FALSE
  ) +
  scale_color_manual(
    name = "Year",
    breaks = c("2003", "2004"),
    values = c("2003" = "darkblue", "2004" = "darkred")
  ) +
  scale_linetype_manual(
    name = "Year",
    breaks = c("2003", "2004"),
    values = c("2003" = "solid", "2004" = "dashed")
  ) +
  labs(
    x = NULL,
    y = "Avg Delay (min)",
    title = "Average Flight Delay in 2003-2004",
    subtitle = "(quarterly)",
    caption = "Figure 3"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.subtitle = element_text(hjust = 0.5),
    plot.caption = element_text(hjust = 0.5)
  )

Question 2

Do older planes suffer more delays?

# Removing NAs

# Drop every plane record that has a missing value in any column, then
# preview the first rows.
plane <- plane %>% na.omit()
head(plane, n = 6)
##    tailnum        type     manufacturer issue_date     model status
## 35  N10156 Corporation          EMBRAER 02/13/2004 EMB-145XR  Valid
## 36  N102UW Corporation AIRBUS INDUSTRIE 05/26/1999  A320-214  Valid
## 37  N10323 Corporation           BOEING 07/01/1997   737-3TO  Valid
## 38  N103US Corporation AIRBUS INDUSTRIE 06/18/1999  A320-214  Valid
## 39  N104UA Corporation           BOEING 01/26/1998   747-422  Valid
## 40  N104UW Corporation AIRBUS INDUSTRIE 07/02/1999  A320-214  Valid
##              aircraft_type engine_type year
## 35 Fixed Wing Multi-Engine   Turbo-Fan 2004
## 36 Fixed Wing Multi-Engine   Turbo-Fan 1998
## 37 Fixed Wing Multi-Engine   Turbo-Jet 1986
## 38 Fixed Wing Multi-Engine   Turbo-Fan 1999
## 39 Fixed Wing Multi-Engine   Turbo-Fan 1998
## 40 Fixed Wing Multi-Engine   Turbo-Fan 1999
# Merge the flight and plane table to see the average delay of every plane sorted by its manufacturing year

# Per plane (TailNum) and manufacturing year: mean arrival delay and number of
# late arrivals (ArrDelay > 0). Flights without a matching plane record keep
# an NA year because of the left join. Rows are sorted by year.
older_delays <- flight %>%
  filter(ArrDelay > 0) %>%
  left_join(plane, by = c("TailNum" = "tailnum")) %>%
  group_by(year, TailNum) %>%
  summarise(
    avg_delay = mean(ArrDelay),
    delay_counts = n()
  ) %>%
  arrange(year)
## `summarise()` has grouped output by 'year'. You can override using the `.groups` argument.
head(older_delays)
## # A tibble: 6 x 4
## # Groups:   year [4]
##   year  TailNum avg_delay delay_counts
##   <chr> <chr>       <dbl>        <int>
## 1 0000  N235SW       16.6         1745
## 2 0000  N384AE       23.8          385
## 3 1956  N381AA       27.9          278
## 4 1957  N3744D       19.9          801
## 5 1959  N201AA       29.5          852
## 6 1959  N567AA       31.1          838
# Keeping only planes with a valid manufacturing year (no later than 2004, the last year of flight data)

# Keep only planes whose manufacturing year is known and plausible.
# The year column is text at this point, so it is converted to integer
# *before* the upper bound is applied: the bound is then a numeric comparison
# rather than a lexicographic string comparison.
older_delays <- older_delays %>%
  ungroup() %>%                      # year is a grouping column; ungroup to modify it
  filter(!is.na(year), year != "None", year != "0000") %>%
  mutate(year = as.integer(year)) %>%
  filter(year <= 2004L) %>%          # also drops any value that failed to parse (NA)
  group_by(year)                     # restore the grouping the table had before

head(older_delays)
## # A tibble: 6 x 4
## # Groups:   year [5]
##    year TailNum avg_delay delay_counts
##   <int> <chr>       <dbl>        <int>
## 1  1956 N381AA       27.9          278
## 2  1957 N3744D       19.9          801
## 3  1959 N201AA       29.5          852
## 4  1959 N567AA       31.1          838
## 5  1962 N421AA       32.1          960
## 6  1963 N378AA       30.0          297
# Scatter plot of every plane's average delay against its manufacturing year. If older planes suffered more delays, we would expect a negative linear relationship; however, the plot does not show one.

# Jittered scatter plot: one point per plane, average arrival delay against
# manufacturing year, coloured in steps by year.
ggplot(
  older_delays,
  aes(x = year, y = avg_delay, colour = year)
) +
  geom_jitter() +
  scale_color_steps(low = "#9D43C2", high = "#482158") +
  labs(
    x = NULL,
    y = "Avg Delay (Min)",
    title = "Scatter Plot of Average Flight Delay by Year",
    caption = "Figure 5"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
    plot.title = element_text(hjust = 0.5, face = "bold"),
    plot.caption = element_text(hjust = 0.5),
    legend.position = "none"
  )

Question 3

How does the number of people flying between different locations change over time?

# Creating a new column that combines origin and destination as a route

# Route identifier of the form "<Origin>-<Dest>".
flight$org_dest <- paste(flight$Origin, flight$Dest, sep = "-")
# Counting the number of flights per origin-destination, separated by years

# Number of flights on each route in 2003.
org_dest_2003 <- filter(flight, Year == 2003) %>%
  group_by(org_dest) %>%
  summarise(total_flight_2003 = n())

# Number of flights on each route in 2004.
org_dest_2004 <- filter(flight, Year == 2004) %>%
  group_by(org_dest) %>%
  summarise(total_flight_2004 = n())
# Taking the difference between the two years, then retrieving the routes with the greatest positive and negative changes

# Year-over-year change for every route flown in both years.
# `difference` is the 2004 count minus the 2003 count; `perc_change` is that
# difference as a fraction of the 2003 count (1 = +100%).
route_change <- org_dest_2003 %>%
  inner_join(org_dest_2004, by = "org_dest") %>%
  mutate(
    difference = total_flight_2004 - total_flight_2003,
    perc_change = round(difference / total_flight_2003, 6)
  )

# The ten routes with the largest relative increase from 2003 to 2004.
more_2003 <- route_change %>%
  arrange(desc(perc_change)) %>%
  head(10)

# The ten routes with the largest relative decrease from 2003 to 2004.
more_2004 <- route_change %>%
  arrange(perc_change) %>%
  head(10)
more_2003
## # A tibble: 10 x 5
##    org_dest total_flight_2003 total_flight_2004 difference perc_change
##    <chr>                <int>             <int>      <int>       <dbl>
##  1 TLH-IAH                  1               464        463       463  
##  2 RSW-MCO                  1               346        345       345  
##  3 SGF-MEM                  1               305        304       304  
##  4 DFW-PIA                  2               334        332       166  
##  5 OKC-DTW                  2               205        203       102. 
##  6 DFW-MYR                  1                90         89        89  
##  7 PHL-AVP                  3               252        249        83  
##  8 AVP-PHL                  4               258        254        63.5
##  9 CVG-CAK                 41              2571       2530        61.7
## 10 CAK-CVG                 41              2546       2505        61.1
more_2004
## # A tibble: 10 x 5
##    org_dest total_flight_2003 total_flight_2004 difference perc_change
##    <chr>                <int>             <int>      <int>       <dbl>
##  1 CDC-SGU                128                 1       -127      -0.992
##  2 STL-SAT               1129                14      -1115      -0.988
##  3 SAT-STL               1127                14      -1113      -0.988
##  4 DTW-SJC                307                 4       -303      -0.987
##  5 SLC-AUS               1044                15      -1029      -0.986
##  6 SLC-SAT                663                10       -653      -0.985
##  7 SJC-DTW                306                 5       -301      -0.984
##  8 TUL-OKC                364                 6       -358      -0.984
##  9 AUS-SLC               1036                18      -1018      -0.983
## 10 OKC-TUL                365                 7       -358      -0.981
# Comparing by Origin
# Flights departing from each origin airport in 2003.
origin_03 <- filter(flight, Year == 2003) %>%
  group_by(Origin) %>%
  summarise(total_flight_03 = n())

# Flights departing from each origin airport in 2004.
origin_04 <- filter(flight, Year == 2004) %>%
  group_by(Origin) %>%
  summarise(total_flight_04 = n())

# Origins present in both years, with airport coordinates attached and the
# relative change in departures (fraction of the 2003 count), sorted from the
# largest decrease to the largest increase.
origin_comparison <- inner_join(origin_03, origin_04, by = "Origin") %>%
  left_join(airports, by = c("Origin" = "iata")) %>%
  transmute(
    Origin, long, lat, total_flight_03, total_flight_04,
    perc_diff = round((total_flight_04 - total_flight_03) / total_flight_03, 2)
  ) %>%
  arrange(perc_diff)

head(origin_comparison, n = 10)
## # A tibble: 10 x 6
##    Origin   long   lat total_flight_03 total_flight_04 perc_diff
##    <chr>   <dbl> <dbl>           <int>           <int>     <dbl>
##  1 DUT    -167.   53.9             511              37     -0.93
##  2 DRO    -108.   37.2            1519             444     -0.71
##  3 FMN    -108.   36.7               2               1     -0.5 
##  4 STL     -90.4  38.7           99485           60308     -0.39
##  5 STX     -64.8  17.7             622             399     -0.36
##  6 EFD     -95.2  29.6            1537            1034     -0.33
##  7 ILE     -97.7  31.1            4366            2904     -0.33
##  8 MEI     -88.8  32.3            1506            1032     -0.31
##  9 WYS    -111.   44.7             321             221     -0.31
## 10 CDC    -113.   37.7            1048             737     -0.3
tail(origin_comparison)
## # A tibble: 6 x 6
##   Origin   long   lat total_flight_03 total_flight_04 perc_diff
##   <chr>   <dbl> <dbl>           <int>           <int>     <dbl>
## 1 LNY    -157.   20.8              57             210      2.68
## 2 MKK    -157.   21.2              57             210      2.68
## 3 AVP     -75.7  41.3             806            3026      2.75
## 4 ITO    -155.   19.7             357            2520      6.06
## 5 ACY     -74.6  39.5             127            1056      7.31
## 6 ERI     -80.2  42.1              59            1033     16.5
# Comparing by Destination
# Flights arriving at each destination airport in 2003.
dest_03 <- filter(flight, Year == 2003) %>%
  group_by(Dest) %>%
  summarise(total_flight_03 = n())

# Flights arriving at each destination airport in 2004.
dest_04 <- filter(flight, Year == 2004) %>%
  group_by(Dest) %>%
  summarise(total_flight_04 = n())

# Destinations present in both years, with airport coordinates attached and
# the relative change in arrivals (fraction of the 2003 count), sorted from
# the largest decrease to the largest increase.
dest_comparison <- inner_join(dest_03, dest_04, by = "Dest") %>%
  left_join(airports, by = c("Dest" = "iata")) %>%
  transmute(
    Dest, long, lat, total_flight_03, total_flight_04,
    perc_diff = round((total_flight_04 - total_flight_03) / total_flight_03, 2)
  ) %>%
  arrange(perc_diff)

head(dest_comparison, n = 10)
## # A tibble: 10 x 6
##    Dest    long   lat total_flight_03 total_flight_04 perc_diff
##    <chr>  <dbl> <dbl>           <int>           <int>     <dbl>
##  1 DUT   -167.   53.9             522              38     -0.93
##  2 DRO   -108.   37.2            1521             449     -0.7 
##  3 STL    -90.4  38.7           99481           60339     -0.39
##  4 STX    -64.8  17.7             625             399     -0.36
##  5 ILE    -97.7  31.1            4374            2906     -0.34
##  6 EFD    -95.2  29.6            1533            1032     -0.33
##  7 CDC   -113.   37.7            1048             728     -0.31
##  8 MEI    -88.8  32.3            1516            1044     -0.31
##  9 WYS   -111.   44.7             322             225     -0.3 
## 10 GTR    -88.6  33.5            1931            1379     -0.29
tail(dest_comparison)
## # A tibble: 6 x 6
##   Dest    long   lat total_flight_03 total_flight_04 perc_diff
##   <chr>  <dbl> <dbl>           <int>           <int>     <dbl>
## 1 MKK   -157.   21.2              57             209      2.67
## 2 LNY   -157.   20.8              60             222      2.7 
## 3 AVP    -75.7  41.3             816            3037      2.72
## 4 ITO   -155.   19.7             357            2523      6.07
## 5 ACY    -74.6  39.5             126            1060      7.41
## 6 ERI    -80.2  42.1              61            1038     16.0
# Plotting the difference for origin
# Shared map settings for the airport plots.
# The projection type must be one of plotly's projection names; "world" is a
# scope value, not a projection, so "albers usa" (the projection that matches
# scope = "usa") is named explicitly.
geo <- list(
  scope = "usa",
  projection = list(type = "albers usa"),
  showland = TRUE,
  landcolor = toRGB("gray95"),
  countrycolor = toRGB("gray80")
)

# One marker per origin airport, coloured by the relative change in departures
# between 2003 and 2004; hovering shows the airport code and the change.
plot_geo(locationmode = "USA-states") %>%
  add_markers(
    data = origin_comparison,
    x = ~long,
    y = ~lat,
    text = ~paste("Origin :", Origin,
                  "<br> Difference (decimal) :", perc_diff),
    color = ~perc_diff,
    hoverinfo = "text",
    alpha = 0.7,
    marker = list(size = 10)
  ) %>%
  layout(
    title = "Origin Comparison between 2003-2004",
    geo = geo
  )
# Plotting the difference for destination
# One marker per destination airport, coloured by the relative change in
# arrivals between 2003 and 2004; hovering shows the airport code and change.
dest_map <- plot_geo(locationmode = "USA-states")
dest_map <- add_markers(
  dest_map,
  data = dest_comparison,
  x = ~long,
  y = ~lat,
  color = ~perc_diff,
  text = ~paste("Destination :", Dest,
                "<br> Difference (decimal):", perc_diff),
  hoverinfo = "text",
  alpha = 0.7,
  marker = list(size = 10)
)
layout(
  dest_map,
  title = "Destination Comparison between 2003-2004",
  geo = geo
)

Question 4

Can you detect cascading failures as delays in one airport create delays in others?

# Retrieving the top 5 routes

# The five busiest routes by number of flights.
# slice_max() names the ranking column explicitly (no "Selecting by ..."
# guess) and returns the rows ordered from most to fewest flights; like
# top_n(), it keeps ties.
flight %>%
  group_by(org_dest) %>%
  summarise(counts = n()) %>%
  slice_max(counts, n = 5)
## # A tibble: 5 x 2
##   org_dest counts
##   <chr>     <int>
## 1 LAX-SAN   29588
## 2 SAN-LAX   29524
## 3 BOS-LGA   25063
## 4 LGA-BOS   24561
## 5 LGA-DCA   24327
# Taking example from airports that have the most flights

# Flights that arrived more than 60 minutes late and either departed from or
# landed at LAX or SAN, ordered by date, plane and departure time so that
# consecutive legs flown by the same aircraft appear next to each other.
busiest_airports <- c("LAX", "SAN")

flight %>%
  filter(
    ArrDelay > 60,
    Origin %in% busiest_airports | Dest %in% busiest_airports
  ) %>%
  arrange(Year, Month, DayofMonth, TailNum, DepTime) %>%
  select(
    Year, Month, DayofMonth, TailNum,
    CRSDepTime, DepTime, CRSArrTime, ArrTime,
    DepDelay, ArrDelay, Origin, Dest
  ) %>%
  head(10)
##    Year Month DayofMonth TailNum CRSDepTime DepTime CRSArrTime ArrTime DepDelay
## 1  2003     1          1   N1608       1020    1131       1735    1840       71
## 2  2003     1          1  N299SW       2327    2313         11    2351        0
## 3  2003     1          1  N302AA       2245    2357        620     754       72
## 4  2003     1          1  N339MQ       1310    1509       1405    1610      119
## 5  2003     1          1    N342       1630    1755       1740    1858       85
## 6  2003     1          1    N342       1810    1925       1910    2025       75
## 7  2003     1          1  N37277       1415    1524       2210    2327       69
## 8  2003     1          1  N450UA        815    1018       1028    1210      123
## 9  2003     1          1  N450UA       1124    1257       1715    1839       93
## 10 2003     1          1  N514UA       1605    1935       1913    2205      210
##    ArrDelay Origin Dest
## 1        65    LAX  ATL
## 2      1420    LAX  PSP
## 3        94    LAX  IAD
## 4       125    FAT  LAX
## 5        78    LAS  LAX
## 6        75    LAX  LAS
## 7        77    SAN  EWR
## 8       102    ORD  SAN
## 9        84    SAN  ORD
## 10      172    EWR  LAX

Question 5

Use the available variables to construct a model that predicts delays.

# Skimming the dataset
skimr::skim(flight)
Data summary
Name flight
Number of rows 13363418
Number of columns 32
_______________________
Column type frequency:
character 8
numeric 24
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
UniqueCarrier 0 1.0 2 2 0 19 0
TailNum 0 1.0 0 6 58 5830 0
Origin 0 1.0 3 3 0 288 0
Dest 0 1.0 3 3 0 283 0
CancellationCode 2619866 0.8 0 1 10743544 4 0
new_dep 0 1.0 11 12 0 8 0
quarter 0 1.0 2 2 0 4 0
org_dest 0 1.0 7 7 0 4852 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Year 0 1.0 2003.52 0.50 2003 2003 2004 2004 2004 ▇▁▁▁▇
Month 0 1.0 6.54 3.44 1 4 7 10 12 ▇▅▆▅▇
DayofMonth 0 1.0 15.75 8.79 1 8 16 23 31 ▇▇▇▇▆
DayOfWeek 0 1.0 3.94 1.99 1 2 4 6 7 ▇▅▅▅▇
DepTime 0 1.0 1345.61 469.78 1 938 1330 1732 2750 ▁▇▇▆▁
CRSDepTime 0 1.0 1340.35 460.10 0 935 1328 1725 2400 ▁▇▇▇▃
ArrTime 0 1.0 1500.32 488.91 1 1123 1524 1915 2955 ▁▆▇▇▁
CRSArrTime 0 1.0 1506.24 473.36 0 1128 1526 1910 2400 ▁▃▇▇▆
FlightNum 0 1.0 2041.70 1939.24 1 585 1403 2857 9912 ▇▂▁▁▁
ActualElapsedTime 0 1.0 122.91 70.16 -710 72 104 154 1777 ▁▇▁▁▁
CRSElapsedTime 0 1.0 124.39 69.33 -32 75 105 155 1441 ▇▁▁▁▁
AirTime 0 1.0 103.27 81.93 -3818 53 85 137 3508 ▁▁▇▁▁
ArrDelay 0 1.0 5.12 31.96 -1302 -10 -2 9 1879 ▁▁▇▁▁
DepDelay 0 1.0 6.60 28.11 -1410 -4 0 4 1882 ▁▁▇▁▁
Distance 0 1.0 714.99 569.48 8 305 547 946 4962 ▇▂▁▁▁
TaxiIn 0 1.0 7.47 41.22 0 4 5 7 1495 ▇▁▁▁▁
TaxiOut 0 1.0 15.67 13.13 0 10 13 18 3905 ▇▁▁▁▁
Cancelled 0 1.0 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
Diverted 0 1.0 0.00 0.00 0 0 0 0 0 ▁▁▇▁▁
CarrierDelay 2619866 0.8 2.55 16.17 0 0 0 0 1879 ▇▁▁▁▁
WeatherDelay 2619866 0.8 0.65 8.39 0 0 0 0 1230 ▇▁▁▁▁
NASDelay 2619866 0.8 3.38 14.73 -60 0 0 0 1385 ▇▁▁▁▁
SecurityDelay 2619866 0.8 0.02 1.21 0 0 0 0 533 ▇▁▁▁▁
LateAircraftDelay 2619866 0.8 3.22 16.24 0 0 0 0 1407 ▇▁▁▁▁
# Keep only numeric variables that are known before the flight departs, so
# they can be used for future predictions, together with the response ArrDelay
flight_selected <- select(
  flight,
  Month, DayofMonth, DayOfWeek,
  CRSDepTime, CRSArrTime, CRSElapsedTime,
  Distance, ArrDelay
)
# Regression task: ArrDelay is the target, built on the selected columns
task <- TaskRegr$new(
  id = "flight_selected",
  backend = flight_selected,
  target = "ArrDelay"
)

# Mean squared error is the performance measure
measure <- msr("regr.mse")

# Seeded 70/30 train/test split over the row indices 1..nrow
set.seed(3005)
train_data <- sample.int(task$nrow, size = 0.7 * task$nrow)
test_data <- setdiff(seq_len(task$nrow), train_data)

Ridge Regression

# Ridge regression: glmnet learner with alpha = 0 and lambda fixed at 0.1,
# wrapped in a GraphLearner and fitted on the training rows only
learner_ridge <- lrn("regr.glmnet")
learner_ridge$param_set$values <- list(alpha = 0, lambda = 0.1)
glrn_ridge <- GraphLearner$new(learner_ridge)
glrn_ridge$train(task, row_ids = train_data)
# Ridge: MSE on the rows the model was trained on
glrn_ridge$predict(task, row_ids = train_data)$score(measures = measure)
## regr.mse 
## 1007.176
# Ridge: MSE on the held-out test rows
glrn_ridge$predict(task, row_ids = test_data)$score(measures = measure)
## regr.mse 
## 1005.817

LASSO Regression

# LASSO regression: glmnet learner with alpha = 1 and lambda fixed at 0.1,
# wrapped in a GraphLearner and fitted on the training rows only
learner_lasso <- lrn("regr.glmnet")
learner_lasso$param_set$values <- list(alpha = 1, lambda = 0.1)
glrn_lasso <- GraphLearner$new(learner_lasso)
glrn_lasso$train(task, row_ids = train_data)
# LASSO: MSE on the rows the model was trained on
glrn_lasso$predict(task, row_ids = train_data)$score(measures = measure)
## regr.mse 
## 1007.222
# LASSO: MSE on the held-out test rows
glrn_lasso$predict(task, row_ids = test_data)$score(measures = measure)
## regr.mse 
## 1005.873

Sampling for random forest

# Draw a seeded random sample of 50,000 flights for the random forest
set.seed(2002)
flight_sample <- flight %>%
  sample_n(50000)
# Same predictor/response columns as before, taken from the sample
flight_sample_selected <- select(
  flight_sample,
  Month, DayofMonth, DayOfWeek,
  CRSDepTime, CRSArrTime, CRSElapsedTime,
  Distance, ArrDelay
)
# `task` now refers to the sampled data; it replaces the full-data task
task <- TaskRegr$new(
  id = "flight_sample_selected",
  backend = flight_sample_selected,
  target = "ArrDelay"
)
# Seeded 70/30 train/test split over the sampled rows; this overwrites the
# earlier train_data / test_data
set.seed(3005)
train_data <- sample.int(task$nrow, size = 0.7 * task$nrow)
test_data <- setdiff(seq_len(task$nrow), train_data)

Random Forest

# Random forest (ranger) with min.node.size fixed at 4; the number of trees
# is chosen by the tuner configured below
learner_rf <- lrn("regr.ranger")
learner_rf$param_set$values <- list(min.node.size = 4)
glrn_rf <- GraphLearner$new(learner_rf)

# Search space: number of trees between 100 and 500. The parameter id carries
# the "regr.ranger." prefix because the learner is wrapped in a GraphLearner
tune_ntrees <- ParamSet$new(
  list(ParamInt$new("regr.ranger.num.trees", lower = 100, upper = 500))
)

# Grid search capped at 30 evaluations
# (the logged run below finished after 10 evaluations)
terminator <- trm("evals", n_evals = 30)
tuner <- tnr("grid_search")

# Each candidate is scored with `measure` under 5-fold cross-validation
autotune_rf <- AutoTuner$new(
  learner = glrn_rf,
  resampling = rsmp("cv", folds = 5),
  measure = measure,
  search_space = tune_ntrees,
  terminator = terminator,
  tuner = tuner
)

# Tune and fit using the training rows only
autotune_rf$train(task, row_ids = train_data)
## INFO  [10:35:15.148] [bbotk] Starting to optimize 1 parameter(s) with '<TunerGridSearch>' and '<TerminatorEvals> [n_evals=30, k=0]' 
## INFO  [10:35:15.219] [bbotk] Evaluating 1 configuration(s) 
## INFO  [10:35:15.280] [mlr3] Running benchmark with 5 resampling iterations 
## INFO  [10:35:15.498] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5) 
## INFO  [10:35:20.639] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5) 
## INFO  [10:35:25.319] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5) 
## INFO  [10:35:30.165] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5) 
## INFO  [10:35:34.976] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5) 
## INFO  [10:35:39.994] [mlr3] Finished benchmark 
## INFO  [10:35:40.087] [bbotk] Result of batch 1: 
## INFO  [10:35:40.096] [bbotk]  regr.ranger.num.trees regr.mse warnings errors runtime_learners 
## INFO  [10:35:40.096] [bbotk]                    233 1000.593        0      0            24.37 
## INFO  [10:35:40.096] [bbotk]                                 uhash 
## INFO  [10:35:40.096] [bbotk]  df97f0f5-532e-4052-9e04-a027907cf565 
## INFO  [10:35:40.099] [bbotk] Evaluating 1 configuration(s) 
## INFO  [10:35:40.212] [mlr3] Running benchmark with 5 resampling iterations 
## INFO  [10:35:40.237] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5) 
## INFO  [10:35:47.073] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5) 
## INFO  [10:35:54.617] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5) 
## INFO  [10:36:01.667] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5) 
## INFO  [10:36:09.071] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5) 
## INFO  [10:36:16.570] [mlr3] Finished benchmark 
## INFO  [10:36:16.679] [bbotk] Result of batch 2: 
## INFO  [10:36:16.709] [bbotk]  regr.ranger.num.trees regr.mse warnings errors runtime_learners 
## INFO  [10:36:16.709] [bbotk]                    367 999.4707        0      0            36.22 
## INFO  [10:36:16.709] [bbotk]                                 uhash 
## INFO  [10:36:16.709] [bbotk]  f54343e2-4c2b-4686-bb72-8417528d09a2 
## INFO  [10:36:16.713] [bbotk] Evaluating 1 configuration(s) 
## INFO  [10:36:16.777] [mlr3] Running benchmark with 5 resampling iterations 
## INFO  [10:36:16.799] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5) 
## INFO  [10:36:29.573] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5) 
## INFO  [10:36:39.097] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5) 
## INFO  [10:36:48.830] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5) 
## INFO  [10:36:58.696] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5) 
## INFO  [10:37:08.116] [mlr3] Finished benchmark 
## INFO  [10:37:08.202] [bbotk] Result of batch 3: 
## INFO  [10:37:08.206] [bbotk]  regr.ranger.num.trees regr.mse warnings errors runtime_learners 
## INFO  [10:37:08.206] [bbotk]                    456 999.2912        0      0             51.2 
## INFO  [10:37:08.206] [bbotk]                                 uhash 
## INFO  [10:37:08.206] [bbotk]  e75854b9-cf2a-485a-9774-758391d40dd7 
## INFO  [10:37:08.209] [bbotk] Evaluating 1 configuration(s) 
## INFO  [10:37:08.299] [mlr3] Running benchmark with 5 resampling iterations 
## INFO  [10:37:08.320] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5) 
## INFO  [10:37:17.802] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5) 
## INFO  [10:37:26.615] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5) 
## INFO  [10:37:35.628] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5) 
## INFO  [10:37:44.417] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5) 
## INFO  [10:37:53.243] [mlr3] Finished benchmark 
## INFO  [10:37:53.364] [bbotk] Result of batch 4: 
## INFO  [10:37:53.370] [bbotk]  regr.ranger.num.trees regr.mse warnings errors runtime_learners 
## INFO  [10:37:53.370] [bbotk]                    411 999.4785        0      0            44.84 
## INFO  [10:37:53.370] [bbotk]                                 uhash 
## INFO  [10:37:53.370] [bbotk]  5a509288-0a59-422f-93e8-b9af5de7a041 
## INFO  [10:37:53.376] [bbotk] Evaluating 1 configuration(s) 
## INFO  [10:37:53.484] [mlr3] Running benchmark with 5 resampling iterations 
## INFO  [10:37:53.510] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5) 
## INFO  [10:37:55.561] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5) 
## INFO  [10:37:57.578] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5) 
## INFO  [10:37:59.560] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5) 
## INFO  [10:38:01.588] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5) 
## INFO  [10:38:03.628] [mlr3] Finished benchmark 
## INFO  [10:38:03.740] [bbotk] Result of batch 5: 
## INFO  [10:38:03.746] [bbotk]  regr.ranger.num.trees regr.mse warnings errors runtime_learners 
## INFO  [10:38:03.746] [bbotk]                    100  1006.07        0      0               10 
## INFO  [10:38:03.746] [bbotk]                                 uhash 
## INFO  [10:38:03.746] [bbotk]  f247da3b-92ef-4a6c-909b-4779f392567a 
## INFO  [10:38:03.750] [bbotk] Evaluating 1 configuration(s) 
## INFO  [10:38:03.846] [mlr3] Running benchmark with 5 resampling iterations 
## INFO  [10:38:03.864] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5) 
## INFO  [10:38:06.786] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5) 
## INFO  [10:38:09.880] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5) 
## INFO  [10:38:12.665] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5) 
## INFO  [10:38:16.397] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5) 
## INFO  [10:38:19.224] [mlr3] Finished benchmark 
## INFO  [10:38:19.334] [bbotk] Result of batch 6: 
## INFO  [10:38:19.340] [bbotk]  regr.ranger.num.trees regr.mse warnings errors runtime_learners 
## INFO  [10:38:19.340] [bbotk]                    144  1003.69        0      0            15.25 
## INFO  [10:38:19.340] [bbotk]                                 uhash 
## INFO  [10:38:19.340] [bbotk]  99996684-c03d-4a90-b708-b5a9eb06a793 
## INFO  [10:38:19.345] [bbotk] Evaluating 1 configuration(s) 
## INFO  [10:38:19.437] [mlr3] Running benchmark with 5 resampling iterations 
## INFO  [10:38:19.456] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5) 
## INFO  [10:38:25.724] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5) 
## INFO  [10:38:31.740] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5) 
## INFO  [10:38:37.772] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5) 
## INFO  [10:38:43.468] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5) 
## INFO  [10:38:49.290] [mlr3] Finished benchmark 
## INFO  [10:38:49.410] [bbotk] Result of batch 7: 
## INFO  [10:38:49.416] [bbotk]  regr.ranger.num.trees regr.mse warnings errors runtime_learners 
## INFO  [10:38:49.416] [bbotk]                    278 999.7746        0      0            29.63 
## INFO  [10:38:49.416] [bbotk]                                 uhash 
## INFO  [10:38:49.416] [bbotk]  15fed893-79b9-46da-af52-85d83350c773 
## INFO  [10:38:49.461] [bbotk] Evaluating 1 configuration(s) 
## INFO  [10:38:49.566] [mlr3] Running benchmark with 5 resampling iterations 
## INFO  [10:38:49.586] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5) 
## INFO  [10:39:00.432] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5) 
## INFO  [10:39:11.459] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5) 
## INFO  [10:39:21.732] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5) 
## INFO  [10:39:32.432] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5) 
## INFO  [10:39:42.250] [mlr3] Finished benchmark 
## INFO  [10:39:42.400] [bbotk] Result of batch 8: 
## INFO  [10:39:42.404] [bbotk]  regr.ranger.num.trees regr.mse warnings errors runtime_learners 
## INFO  [10:39:42.404] [bbotk]                    500 998.6334        0      0             52.5 
## INFO  [10:39:42.404] [bbotk]                                 uhash 
## INFO  [10:39:42.404] [bbotk]  bb0ec28c-3365-4691-9c81-fe15c35473e5 
## INFO  [10:39:42.407] [bbotk] Evaluating 1 configuration(s) 
## INFO  [10:39:42.469] [mlr3] Running benchmark with 5 resampling iterations 
## INFO  [10:39:42.482] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5) 
## INFO  [10:39:49.033] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5) 
## INFO  [10:39:55.035] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5) 
## INFO  [10:40:03.525] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5) 
## INFO  [10:40:12.041] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5) 
## INFO  [10:40:18.357] [mlr3] Finished benchmark 
## INFO  [10:40:18.430] [bbotk] Result of batch 9: 
## INFO  [10:40:18.434] [bbotk]  regr.ranger.num.trees regr.mse warnings errors runtime_learners 
## INFO  [10:40:18.434] [bbotk]                    322 999.5913        0      0            35.77 
## INFO  [10:40:18.434] [bbotk]                                 uhash 
## INFO  [10:40:18.434] [bbotk]  ff8b78d4-627d-4660-baf5-5db393e4dd31 
## INFO  [10:40:18.436] [bbotk] Evaluating 1 configuration(s) 
## INFO  [10:40:18.487] [mlr3] Running benchmark with 5 resampling iterations 
## INFO  [10:40:18.500] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 2/5) 
## INFO  [10:40:22.345] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 5/5) 
## INFO  [10:40:25.925] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 3/5) 
## INFO  [10:40:29.495] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 1/5) 
## INFO  [10:40:33.020] [mlr3] Applying learner 'regr.ranger' on task 'flight_sample_selected' (iter 4/5) 
## INFO  [10:40:36.753] [mlr3] Finished benchmark 
## INFO  [10:40:36.836] [bbotk] Result of batch 10: 
## INFO  [10:40:36.840] [bbotk]  regr.ranger.num.trees regr.mse warnings errors runtime_learners 
## INFO  [10:40:36.840] [bbotk]                    189 1002.084        0      0             18.1 
## INFO  [10:40:36.840] [bbotk]                                 uhash 
## INFO  [10:40:36.840] [bbotk]  92511b9e-2358-4c6b-8dd9-659d4d0bb9b8 
## INFO  [10:40:36.856] [bbotk] Finished optimizing after 10 evaluation(s) 
## INFO  [10:40:36.858] [bbotk] Result: 
## INFO  [10:40:36.861] [bbotk]  regr.ranger.num.trees learner_param_vals  x_domain regr.mse 
## INFO  [10:40:36.861] [bbotk]                    500          <list[2]> <list[1]> 998.6334
# Random forest: MSE on the rows the tuned model was trained on
autotune_rf$predict(task, row_ids = train_data)$score(measures = measure)
## regr.mse 
## 232.4166
# Random forest: MSE on the held-out test rows
autotune_rf$predict(task, row_ids = test_data)$score(measures = measure)
## regr.mse 
## 985.8778